// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 %s -O0 -ffreestanding -triple=x86_64-unknown-unknown -target-feature +kl -target-feature +widekl -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=CHECK64
// RUN: %clang_cc1 %s -O0 -ffreestanding -triple=i386-unknown-unknown -target-feature +kl -target-feature +widekl -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=CHECK32

#include <x86intrin.h>

// CHECK64-LABEL: @test_loadiwkey(
// CHECK64-NEXT:  entry:
// CHECK64-NEXT:    [[__CTL_ADDR_I:%.*]] = alloca i32, align 4
// CHECK64-NEXT:    [[__INTKEY_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[__ENKEY_LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[__ENKEY_HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[CTL_ADDR:%.*]] = alloca i32, align 4
// CHECK64-NEXT:    [[INTKEY_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[ENKEY_LO_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[ENKEY_HI_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    store i32 [[CTL:%.*]], ptr [[CTL_ADDR]], align 4
// CHECK64-NEXT:    store <2 x i64> [[INTKEY:%.*]], ptr [[INTKEY_ADDR]], align 16
// CHECK64-NEXT:    store <2 x i64> [[ENKEY_LO:%.*]], ptr [[ENKEY_LO_ADDR]], align 16
// CHECK64-NEXT:    store <2 x i64> [[ENKEY_HI:%.*]], ptr [[ENKEY_HI_ADDR]], align 16
// CHECK64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[CTL_ADDR]], align 4
// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[INTKEY_ADDR]], align 16
// CHECK64-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[ENKEY_LO_ADDR]], align 16
// CHECK64-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[ENKEY_HI_ADDR]], align 16
// CHECK64-NEXT:    store i32 [[TMP0]], ptr [[__CTL_ADDR_I]], align 4
// CHECK64-NEXT:    store <2 x i64> [[TMP1]], ptr [[__INTKEY_ADDR_I]], align 16
// CHECK64-NEXT:    store <2 x i64> [[TMP2]], ptr [[__ENKEY_LO_ADDR_I]], align 16
// CHECK64-NEXT:    store <2 x i64> [[TMP3]], ptr [[__ENKEY_HI_ADDR_I]], align 16
// CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__INTKEY_ADDR_I]], align 16
// CHECK64-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[__ENKEY_LO_ADDR_I]], align 16
// CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[__ENKEY_HI_ADDR_I]], align 16
// CHECK64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__CTL_ADDR_I]], align 4
// CHECK64-NEXT:    call void @llvm.x86.loadiwkey(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], i32 [[TMP7]])
// CHECK64-NEXT:    ret void
//
// CHECK32-LABEL: @test_loadiwkey(
// CHECK32-NEXT:  entry:
// CHECK32-NEXT:    [[__CTL_ADDR_I:%.*]] = alloca i32, align 4
// CHECK32-NEXT:    [[__INTKEY_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[__ENKEY_LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[__ENKEY_HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[CTL_ADDR:%.*]] = alloca i32, align 4
// CHECK32-NEXT:    [[INTKEY_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[ENKEY_LO_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[ENKEY_HI_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    store i32 [[CTL:%.*]], ptr [[CTL_ADDR]], align 4
// CHECK32-NEXT:    store <2 x i64> [[INTKEY:%.*]], ptr [[INTKEY_ADDR]], align 16
// CHECK32-NEXT:    store <2 x i64> [[ENKEY_LO:%.*]], ptr [[ENKEY_LO_ADDR]], align 16
// CHECK32-NEXT:    store <2 x i64> [[ENKEY_HI:%.*]], ptr [[ENKEY_HI_ADDR]], align 16
// CHECK32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[CTL_ADDR]], align 4
// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[INTKEY_ADDR]], align 16
// CHECK32-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[ENKEY_LO_ADDR]], align 16
// CHECK32-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[ENKEY_HI_ADDR]], align 16
// CHECK32-NEXT:    store i32 [[TMP0]], ptr [[__CTL_ADDR_I]], align 4
// CHECK32-NEXT:    store <2 x i64> [[TMP1]], ptr [[__INTKEY_ADDR_I]], align 16
// CHECK32-NEXT:    store <2 x i64> [[TMP2]], ptr [[__ENKEY_LO_ADDR_I]], align 16
// CHECK32-NEXT:    store <2 x i64> [[TMP3]], ptr [[__ENKEY_HI_ADDR_I]], align 16
// CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__INTKEY_ADDR_I]], align 16
// CHECK32-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[__ENKEY_LO_ADDR_I]], align 16
// CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[__ENKEY_HI_ADDR_I]], align 16
// CHECK32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__CTL_ADDR_I]], align 4
// CHECK32-NEXT:    call void @llvm.x86.loadiwkey(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], i32 [[TMP7]])
// CHECK32-NEXT:    ret void
//
// Wraps _mm_loadiwkey so that its IR lowering can be inspected on both
// triples. The autogenerated assertions above expect exactly one call to
// @llvm.x86.loadiwkey whose operands are intkey, enkey_lo, enkey_hi and then
// ctl -- i.e. ctl is passed first here but appears last in the IR call.
void test_loadiwkey(unsigned int ctl, __m128i intkey, __m128i enkey_lo, __m128i enkey_hi) {
  // Keep this a single direct call: the assertions match the -O0 IR
  // instruction by instruction, so extra locals would change the output.
  _mm_loadiwkey(ctl, intkey, enkey_lo, enkey_hi);
}

// CHECK64-LABEL: @test_encodekey128_u32(
// CHECK64-NEXT:  entry:
// CHECK64-NEXT:    [[__HTYPE_ADDR_I:%.*]] = alloca i32, align 4
// CHECK64-NEXT:    [[__KEY_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[HTYPE_ADDR:%.*]] = alloca i32, align 4
// CHECK64-NEXT:    [[KEY_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    store i32 [[HTYPE:%.*]], ptr [[HTYPE_ADDR]], align 4
// CHECK64-NEXT:    store <2 x i64> [[KEY:%.*]], ptr [[KEY_ADDR]], align 16
// CHECK64-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[HTYPE_ADDR]], align 4
// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[KEY_ADDR]], align 16
// CHECK64-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    store i32 [[TMP0]], ptr [[__HTYPE_ADDR_I]], align 4
// CHECK64-NEXT:    store <2 x i64> [[TMP1]], ptr [[__KEY_ADDR_I]], align 16
// CHECK64-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__HTYPE_ADDR_I]], align 4
// CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__KEY_ADDR_I]], align 16
// CHECK64-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP6:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 [[TMP3]], <2 x i64> [[TMP4]])
// CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 1
// CHECK64-NEXT:    store <2 x i64> [[TMP7]], ptr [[TMP5]], align 1
// CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 2
// CHECK64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP5]], i32 16
// CHECK64-NEXT:    store <2 x i64> [[TMP9]], ptr [[TMP10]], align 1
// CHECK64-NEXT:    [[TMP12:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 3
// CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP5]], i32 32
// CHECK64-NEXT:    store <2 x i64> [[TMP12]], ptr [[TMP13]], align 1
// CHECK64-NEXT:    [[TMP15:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 0
// CHECK64-NEXT:    ret i32 [[TMP15]]
//
// CHECK32-LABEL: @test_encodekey128_u32(
// CHECK32-NEXT:  entry:
// CHECK32-NEXT:    [[__HTYPE_ADDR_I:%.*]] = alloca i32, align 4
// CHECK32-NEXT:    [[__KEY_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[HTYPE_ADDR:%.*]] = alloca i32, align 4
// CHECK32-NEXT:    [[KEY_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    store i32 [[HTYPE:%.*]], ptr [[HTYPE_ADDR]], align 4
// CHECK32-NEXT:    store <2 x i64> [[KEY:%.*]], ptr [[KEY_ADDR]], align 16
// CHECK32-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[HTYPE_ADDR]], align 4
// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[KEY_ADDR]], align 16
// CHECK32-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    store i32 [[TMP0]], ptr [[__HTYPE_ADDR_I]], align 4
// CHECK32-NEXT:    store <2 x i64> [[TMP1]], ptr [[__KEY_ADDR_I]], align 16
// CHECK32-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__HTYPE_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__KEY_ADDR_I]], align 16
// CHECK32-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP6:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 [[TMP3]], <2 x i64> [[TMP4]])
// CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 1
// CHECK32-NEXT:    store <2 x i64> [[TMP7]], ptr [[TMP5]], align 1
// CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 2
// CHECK32-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP5]], i32 16
// CHECK32-NEXT:    store <2 x i64> [[TMP9]], ptr [[TMP10]], align 1
// CHECK32-NEXT:    [[TMP12:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 3
// CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP5]], i32 32
// CHECK32-NEXT:    store <2 x i64> [[TMP12]], ptr [[TMP13]], align 1
// CHECK32-NEXT:    [[TMP15:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 0
// CHECK32-NEXT:    ret i32 [[TMP15]]
//
// Wraps _mm_encodekey128_u32 so that its IR lowering can be inspected.
// Per the autogenerated assertions above, the expected IR calls
// @llvm.x86.encodekey128(htype, key), stores elements 1, 2 and 3 of the
// returned aggregate through h at byte offsets 0, 16 and 32 (each store with
// align 1), and returns element 0 as the i32 result.
unsigned int test_encodekey128_u32(unsigned int htype, __m128i key, void *h) {
  // Keep this a single direct call: the assertions match the -O0 IR
  // instruction by instruction, so extra locals would change the output.
  return _mm_encodekey128_u32(htype, key, h);
}

// CHECK64-LABEL: @test_encodekey256_u32(
// CHECK64-NEXT:  entry:
// CHECK64-NEXT:    [[__HTYPE_ADDR_I:%.*]] = alloca i32, align 4
// CHECK64-NEXT:    [[__KEY_LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[__KEY_HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[HTYPE_ADDR:%.*]] = alloca i32, align 4
// CHECK64-NEXT:    [[KEY_LO_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[KEY_HI_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    store i32 [[HTYPE:%.*]], ptr [[HTYPE_ADDR]], align 4
// CHECK64-NEXT:    store <2 x i64> [[KEY_LO:%.*]], ptr [[KEY_LO_ADDR]], align 16
// CHECK64-NEXT:    store <2 x i64> [[KEY_HI:%.*]], ptr [[KEY_HI_ADDR]], align 16
// CHECK64-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[HTYPE_ADDR]], align 4
// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[KEY_LO_ADDR]], align 16
// CHECK64-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[KEY_HI_ADDR]], align 16
// CHECK64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    store i32 [[TMP0]], ptr [[__HTYPE_ADDR_I]], align 4
// CHECK64-NEXT:    store <2 x i64> [[TMP1]], ptr [[__KEY_LO_ADDR_I]], align 16
// CHECK64-NEXT:    store <2 x i64> [[TMP2]], ptr [[__KEY_HI_ADDR_I]], align 16
// CHECK64-NEXT:    store ptr [[TMP3]], ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__HTYPE_ADDR_I]], align 4
// CHECK64-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[__KEY_LO_ADDR_I]], align 16
// CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[__KEY_HI_ADDR_I]], align 16
// CHECK64-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP8:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]])
// CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 1
// CHECK64-NEXT:    store <2 x i64> [[TMP9]], ptr [[TMP7]], align 1
// CHECK64-NEXT:    [[TMP11:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 2
// CHECK64-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16
// CHECK64-NEXT:    store <2 x i64> [[TMP11]], ptr [[TMP12]], align 1
// CHECK64-NEXT:    [[TMP14:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 3
// CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[TMP7]], i32 32
// CHECK64-NEXT:    store <2 x i64> [[TMP14]], ptr [[TMP15]], align 1
// CHECK64-NEXT:    [[TMP17:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 4
// CHECK64-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP7]], i32 48
// CHECK64-NEXT:    store <2 x i64> [[TMP17]], ptr [[TMP18]], align 1
// CHECK64-NEXT:    [[TMP20:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 0
// CHECK64-NEXT:    ret i32 [[TMP20]]
//
// CHECK32-LABEL: @test_encodekey256_u32(
// CHECK32-NEXT:  entry:
// CHECK32-NEXT:    [[__HTYPE_ADDR_I:%.*]] = alloca i32, align 4
// CHECK32-NEXT:    [[__KEY_LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[__KEY_HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[HTYPE_ADDR:%.*]] = alloca i32, align 4
// CHECK32-NEXT:    [[KEY_LO_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[KEY_HI_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    store i32 [[HTYPE:%.*]], ptr [[HTYPE_ADDR]], align 4
// CHECK32-NEXT:    store <2 x i64> [[KEY_LO:%.*]], ptr [[KEY_LO_ADDR]], align 16
// CHECK32-NEXT:    store <2 x i64> [[KEY_HI:%.*]], ptr [[KEY_HI_ADDR]], align 16
// CHECK32-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[HTYPE_ADDR]], align 4
// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[KEY_LO_ADDR]], align 16
// CHECK32-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[KEY_HI_ADDR]], align 16
// CHECK32-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    store i32 [[TMP0]], ptr [[__HTYPE_ADDR_I]], align 4
// CHECK32-NEXT:    store <2 x i64> [[TMP1]], ptr [[__KEY_LO_ADDR_I]], align 16
// CHECK32-NEXT:    store <2 x i64> [[TMP2]], ptr [[__KEY_HI_ADDR_I]], align 16
// CHECK32-NEXT:    store ptr [[TMP3]], ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__HTYPE_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[__KEY_LO_ADDR_I]], align 16
// CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[__KEY_HI_ADDR_I]], align 16
// CHECK32-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP8:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]])
// CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 1
// CHECK32-NEXT:    store <2 x i64> [[TMP9]], ptr [[TMP7]], align 1
// CHECK32-NEXT:    [[TMP11:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 2
// CHECK32-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16
// CHECK32-NEXT:    store <2 x i64> [[TMP11]], ptr [[TMP12]], align 1
// CHECK32-NEXT:    [[TMP14:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 3
// CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[TMP7]], i32 32
// CHECK32-NEXT:    store <2 x i64> [[TMP14]], ptr [[TMP15]], align 1
// CHECK32-NEXT:    [[TMP17:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 4
// CHECK32-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP7]], i32 48
// CHECK32-NEXT:    store <2 x i64> [[TMP17]], ptr [[TMP18]], align 1
// CHECK32-NEXT:    [[TMP20:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 0
// CHECK32-NEXT:    ret i32 [[TMP20]]
//
// Wraps _mm_encodekey256_u32 so that its IR lowering can be inspected.
// Per the autogenerated assertions above, the expected IR calls
// @llvm.x86.encodekey256(htype, key_lo, key_hi), stores elements 1 through 4
// of the returned aggregate through h at byte offsets 0, 16, 32 and 48 (each
// store with align 1), and returns element 0 as the i32 result.
unsigned int test_encodekey256_u32(unsigned int htype, __m128i key_lo, __m128i key_hi, void *h) {
  // Keep this a single direct call: the assertions match the -O0 IR
  // instruction by instruction, so extra locals would change the output.
  return _mm_encodekey256_u32(htype, key_lo, key_hi, h);
}

// CHECK64-LABEL: @test_mm_aesenc256kl_u8(
// CHECK64-NEXT:  entry:
// CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    store ptr [[ODATA:%.*]], ptr [[ODATA_ADDR]], align 8
// CHECK64-NEXT:    store <2 x i64> [[IDATA:%.*]], ptr [[IDATA_ADDR]], align 16
// CHECK64-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ODATA_ADDR]], align 8
// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[IDATA_ADDR]], align 16
// CHECK64-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    store ptr [[TMP0]], ptr [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT:    store <2 x i64> [[TMP1]], ptr [[__IDATA_ADDR_I]], align 16
// CHECK64-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__IDATA_ADDR_I]], align 16
// CHECK64-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> [[TMP4]], ptr [[TMP5]])
// CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK64-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
// CHECK64-NEXT:    br i1 [[TMP8]], label [[AESENC256KL_NO_ERROR_I:%.*]], label [[AESENC256KL_ERROR_I:%.*]]
// CHECK64:       aesenc256kl_no_error.i:
// CHECK64-NEXT:    store <2 x i64> [[TMP9]], ptr [[TMP3]], align 16
// CHECK64-NEXT:    br label [[_MM_AESENC256KL_U8_EXIT:%.*]]
// CHECK64:       aesenc256kl_error.i:
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
// CHECK64-NEXT:    br label [[_MM_AESENC256KL_U8_EXIT]]
// CHECK64:       _mm_aesenc256kl_u8.exit:
// CHECK64-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK64-NEXT:    ret i8 [[TMP10]]
//
// CHECK32-LABEL: @test_mm_aesenc256kl_u8(
// CHECK32-NEXT:  entry:
// CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    store ptr [[ODATA:%.*]], ptr [[ODATA_ADDR]], align 4
// CHECK32-NEXT:    store <2 x i64> [[IDATA:%.*]], ptr [[IDATA_ADDR]], align 16
// CHECK32-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ODATA_ADDR]], align 4
// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[IDATA_ADDR]], align 16
// CHECK32-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    store ptr [[TMP0]], ptr [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT:    store <2 x i64> [[TMP1]], ptr [[__IDATA_ADDR_I]], align 16
// CHECK32-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__IDATA_ADDR_I]], align 16
// CHECK32-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> [[TMP4]], ptr [[TMP5]])
// CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK32-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
// CHECK32-NEXT:    br i1 [[TMP8]], label [[AESENC256KL_NO_ERROR_I:%.*]], label [[AESENC256KL_ERROR_I:%.*]]
// CHECK32:       aesenc256kl_no_error.i:
// CHECK32-NEXT:    store <2 x i64> [[TMP9]], ptr [[TMP3]], align 16
// CHECK32-NEXT:    br label [[_MM_AESENC256KL_U8_EXIT:%.*]]
// CHECK32:       aesenc256kl_error.i:
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
// CHECK32-NEXT:    br label [[_MM_AESENC256KL_U8_EXIT]]
// CHECK32:       _mm_aesenc256kl_u8.exit:
// CHECK32-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK32-NEXT:    ret i8 [[TMP10]]
//
// Wraps _mm_aesenc256kl_u8 so that its IR lowering can be inspected.
// Per the autogenerated assertions above, the expected IR calls
// @llvm.x86.aesenc256kl(idata, h), which yields an { i8, <2 x i64> } pair.
// Element 0 is truncated to i1 and used as the branch condition: when it is
// true the "no_error" block stores element 1 through odata, otherwise the
// "error" block stores zeroinitializer through odata. Element 0 is then
// returned as the i8 result.
unsigned char test_mm_aesenc256kl_u8(__m128i *odata, __m128i idata, const void *h) {
  // Keep this a single direct call: the assertions match the -O0 IR
  // instruction by instruction, so extra locals would change the output.
  return _mm_aesenc256kl_u8(odata, idata, h);
}

// CHECK64-LABEL: @test_mm_aesdec256kl_u8(
// CHECK64-NEXT:  entry:
// CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    store ptr [[ODATA:%.*]], ptr [[ODATA_ADDR]], align 8
// CHECK64-NEXT:    store <2 x i64> [[IDATA:%.*]], ptr [[IDATA_ADDR]], align 16
// CHECK64-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ODATA_ADDR]], align 8
// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[IDATA_ADDR]], align 16
// CHECK64-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    store ptr [[TMP0]], ptr [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT:    store <2 x i64> [[TMP1]], ptr [[__IDATA_ADDR_I]], align 16
// CHECK64-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__IDATA_ADDR_I]], align 16
// CHECK64-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> [[TMP4]], ptr [[TMP5]])
// CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK64-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
// CHECK64-NEXT:    br i1 [[TMP8]], label [[AESDEC256KL_NO_ERROR_I:%.*]], label [[AESDEC256KL_ERROR_I:%.*]]
// CHECK64:       aesdec256kl_no_error.i:
// CHECK64-NEXT:    store <2 x i64> [[TMP9]], ptr [[TMP3]], align 16
// CHECK64-NEXT:    br label [[_MM_AESDEC256KL_U8_EXIT:%.*]]
// CHECK64:       aesdec256kl_error.i:
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
// CHECK64-NEXT:    br label [[_MM_AESDEC256KL_U8_EXIT]]
// CHECK64:       _mm_aesdec256kl_u8.exit:
// CHECK64-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK64-NEXT:    ret i8 [[TMP10]]
//
// CHECK32-LABEL: @test_mm_aesdec256kl_u8(
// CHECK32-NEXT:  entry:
// CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    store ptr [[ODATA:%.*]], ptr [[ODATA_ADDR]], align 4
// CHECK32-NEXT:    store <2 x i64> [[IDATA:%.*]], ptr [[IDATA_ADDR]], align 16
// CHECK32-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ODATA_ADDR]], align 4
// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[IDATA_ADDR]], align 16
// CHECK32-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    store ptr [[TMP0]], ptr [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT:    store <2 x i64> [[TMP1]], ptr [[__IDATA_ADDR_I]], align 16
// CHECK32-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__IDATA_ADDR_I]], align 16
// CHECK32-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> [[TMP4]], ptr [[TMP5]])
// CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK32-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
// CHECK32-NEXT:    br i1 [[TMP8]], label [[AESDEC256KL_NO_ERROR_I:%.*]], label [[AESDEC256KL_ERROR_I:%.*]]
// CHECK32:       aesdec256kl_no_error.i:
// CHECK32-NEXT:    store <2 x i64> [[TMP9]], ptr [[TMP3]], align 16
// CHECK32-NEXT:    br label [[_MM_AESDEC256KL_U8_EXIT:%.*]]
// CHECK32:       aesdec256kl_error.i:
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
// CHECK32-NEXT:    br label [[_MM_AESDEC256KL_U8_EXIT]]
// CHECK32:       _mm_aesdec256kl_u8.exit:
// CHECK32-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK32-NEXT:    ret i8 [[TMP10]]
//
// Wraps _mm_aesdec256kl_u8 so that its IR lowering can be inspected.
// Per the autogenerated assertions above, the expected IR calls
// @llvm.x86.aesdec256kl(idata, h), which yields an { i8, <2 x i64> } pair.
// Element 0 is truncated to i1 and used as the branch condition: when it is
// true the "no_error" block stores element 1 through odata, otherwise the
// "error" block stores zeroinitializer through odata. Element 0 is then
// returned as the i8 result.
unsigned char test_mm_aesdec256kl_u8(__m128i *odata, __m128i idata, const void *h) {
  // Keep this a single direct call: the assertions match the -O0 IR
  // instruction by instruction, so extra locals would change the output.
  return _mm_aesdec256kl_u8(odata, idata, h);
}

// CHECK64-LABEL: @test_mm_aesenc128kl_u8(
// CHECK64-NEXT:  entry:
// CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    store ptr [[ODATA:%.*]], ptr [[ODATA_ADDR]], align 8
// CHECK64-NEXT:    store <2 x i64> [[IDATA:%.*]], ptr [[IDATA_ADDR]], align 16
// CHECK64-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ODATA_ADDR]], align 8
// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[IDATA_ADDR]], align 16
// CHECK64-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    store ptr [[TMP0]], ptr [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT:    store <2 x i64> [[TMP1]], ptr [[__IDATA_ADDR_I]], align 16
// CHECK64-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__IDATA_ADDR_I]], align 16
// CHECK64-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> [[TMP4]], ptr [[TMP5]])
// CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK64-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
// CHECK64-NEXT:    br i1 [[TMP8]], label [[AESENC128KL_NO_ERROR_I:%.*]], label [[AESENC128KL_ERROR_I:%.*]]
// CHECK64:       aesenc128kl_no_error.i:
// CHECK64-NEXT:    store <2 x i64> [[TMP9]], ptr [[TMP3]], align 16
// CHECK64-NEXT:    br label [[_MM_AESENC128KL_U8_EXIT:%.*]]
// CHECK64:       aesenc128kl_error.i:
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
// CHECK64-NEXT:    br label [[_MM_AESENC128KL_U8_EXIT]]
// CHECK64:       _mm_aesenc128kl_u8.exit:
// CHECK64-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK64-NEXT:    ret i8 [[TMP10]]
//
// CHECK32-LABEL: @test_mm_aesenc128kl_u8(
// CHECK32-NEXT:  entry:
// CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    store ptr [[ODATA:%.*]], ptr [[ODATA_ADDR]], align 4
// CHECK32-NEXT:    store <2 x i64> [[IDATA:%.*]], ptr [[IDATA_ADDR]], align 16
// CHECK32-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ODATA_ADDR]], align 4
// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[IDATA_ADDR]], align 16
// CHECK32-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    store ptr [[TMP0]], ptr [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT:    store <2 x i64> [[TMP1]], ptr [[__IDATA_ADDR_I]], align 16
// CHECK32-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__IDATA_ADDR_I]], align 16
// CHECK32-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> [[TMP4]], ptr [[TMP5]])
// CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK32-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
// CHECK32-NEXT:    br i1 [[TMP8]], label [[AESENC128KL_NO_ERROR_I:%.*]], label [[AESENC128KL_ERROR_I:%.*]]
// CHECK32:       aesenc128kl_no_error.i:
// CHECK32-NEXT:    store <2 x i64> [[TMP9]], ptr [[TMP3]], align 16
// CHECK32-NEXT:    br label [[_MM_AESENC128KL_U8_EXIT:%.*]]
// CHECK32:       aesenc128kl_error.i:
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
// CHECK32-NEXT:    br label [[_MM_AESENC128KL_U8_EXIT]]
// CHECK32:       _mm_aesenc128kl_u8.exit:
// CHECK32-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK32-NEXT:    ret i8 [[TMP10]]
//
// Wraps _mm_aesenc128kl_u8 so that its IR lowering can be inspected.
// Per the autogenerated assertions above, the expected IR calls
// @llvm.x86.aesenc128kl(idata, h), which yields an { i8, <2 x i64> } pair.
// Element 0 is truncated to i1 and used as the branch condition: when it is
// true the "no_error" block stores element 1 through odata, otherwise the
// "error" block stores zeroinitializer through odata. Element 0 is then
// returned as the i8 result.
unsigned char test_mm_aesenc128kl_u8(__m128i *odata, __m128i idata, const void *h) {
  // Keep this a single direct call: the assertions match the -O0 IR
  // instruction by instruction, so extra locals would change the output.
  return _mm_aesenc128kl_u8(odata, idata, h);
}

// CHECK64-LABEL: @test_mm_aesdec128kl_u8(
// CHECK64-NEXT:  entry:
// CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    store ptr [[ODATA:%.*]], ptr [[ODATA_ADDR]], align 8
// CHECK64-NEXT:    store <2 x i64> [[IDATA:%.*]], ptr [[IDATA_ADDR]], align 16
// CHECK64-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ODATA_ADDR]], align 8
// CHECK64-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[IDATA_ADDR]], align 16
// CHECK64-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    store ptr [[TMP0]], ptr [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT:    store <2 x i64> [[TMP1]], ptr [[__IDATA_ADDR_I]], align 16
// CHECK64-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__IDATA_ADDR_I]], align 16
// CHECK64-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> [[TMP4]], ptr [[TMP5]])
// CHECK64-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK64-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK64-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
// CHECK64-NEXT:    br i1 [[TMP8]], label [[AESDEC128KL_NO_ERROR_I:%.*]], label [[AESDEC128KL_ERROR_I:%.*]]
// CHECK64:       aesdec128kl_no_error.i:
// CHECK64-NEXT:    store <2 x i64> [[TMP9]], ptr [[TMP3]], align 16
// CHECK64-NEXT:    br label [[_MM_AESDEC128KL_U8_EXIT:%.*]]
// CHECK64:       aesdec128kl_error.i:
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
// CHECK64-NEXT:    br label [[_MM_AESDEC128KL_U8_EXIT]]
// CHECK64:       _mm_aesdec128kl_u8.exit:
// CHECK64-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK64-NEXT:    ret i8 [[TMP10]]
//
// CHECK32-LABEL: @test_mm_aesdec128kl_u8(
// CHECK32-NEXT:  entry:
// CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca <2 x i64>, align 16
// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    store ptr [[ODATA:%.*]], ptr [[ODATA_ADDR]], align 4
// CHECK32-NEXT:    store <2 x i64> [[IDATA:%.*]], ptr [[IDATA_ADDR]], align 16
// CHECK32-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ODATA_ADDR]], align 4
// CHECK32-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[IDATA_ADDR]], align 16
// CHECK32-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    store ptr [[TMP0]], ptr [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT:    store <2 x i64> [[TMP1]], ptr [[__IDATA_ADDR_I]], align 16
// CHECK32-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__IDATA_ADDR_I]], align 16
// CHECK32-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> [[TMP4]], ptr [[TMP5]])
// CHECK32-NEXT:    [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK32-NEXT:    [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK32-NEXT:    [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
// CHECK32-NEXT:    br i1 [[TMP8]], label [[AESDEC128KL_NO_ERROR_I:%.*]], label [[AESDEC128KL_ERROR_I:%.*]]
// CHECK32:       aesdec128kl_no_error.i:
// CHECK32-NEXT:    store <2 x i64> [[TMP9]], ptr [[TMP3]], align 16
// CHECK32-NEXT:    br label [[_MM_AESDEC128KL_U8_EXIT:%.*]]
// CHECK32:       aesdec128kl_error.i:
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
// CHECK32-NEXT:    br label [[_MM_AESDEC128KL_U8_EXIT]]
// CHECK32:       _mm_aesdec128kl_u8.exit:
// CHECK32-NEXT:    [[TMP10:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK32-NEXT:    ret i8 [[TMP10]]
//
unsigned char test_mm_aesdec128kl_u8(__m128i *odata, __m128i idata, const void *h) {
  // Codegen test for _mm_aesdec128kl_u8: forwards all three arguments
  // unchanged and returns the intrinsic's unsigned char result.
  // The FileCheck assertions above expect, for both the 64-bit and 32-bit
  // triples, a single call to @llvm.x86.aesdec128kl taking idata and h, a
  // branch on the low bit of the returned i8, a store through odata of either
  // the returned vector (no-error block) or zeroinitializer (error block),
  // and a return of element 0 of the call result.
  // Keep the body a single call so the -O0 IR continues to match.
  return _mm_aesdec128kl_u8(odata, idata, h);
}

// CHECK64-LABEL: @test__mm_aesencwide128kl_u8(
// CHECK64-NEXT:  entry:
// CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    store ptr [[ODATA:%.*]], ptr [[ODATA_ADDR]], align 8
// CHECK64-NEXT:    store ptr [[IDATA:%.*]], ptr [[IDATA_ADDR]], align 8
// CHECK64-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ODATA_ADDR]], align 8
// CHECK64-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IDATA_ADDR]], align 8
// CHECK64-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    store ptr [[TMP0]], ptr [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT:    store ptr [[TMP1]], ptr [[__IDATA_ADDR_I]], align 8
// CHECK64-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[__IDATA_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 16
// CHECK64-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 1
// CHECK64-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[TMP7]], align 16
// CHECK64-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 2
// CHECK64-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr [[TMP9]], align 16
// CHECK64-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 3
// CHECK64-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr [[TMP11]], align 16
// CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 4
// CHECK64-NEXT:    [[TMP14:%.*]] = load <2 x i64>, ptr [[TMP13]], align 16
// CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 5
// CHECK64-NEXT:    [[TMP16:%.*]] = load <2 x i64>, ptr [[TMP15]], align 16
// CHECK64-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 6
// CHECK64-NEXT:    [[TMP18:%.*]] = load <2 x i64>, ptr [[TMP17]], align 16
// CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 7
// CHECK64-NEXT:    [[TMP20:%.*]] = load <2 x i64>, ptr [[TMP19]], align 16
// CHECK64-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(ptr [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK64-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK64-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK64-NEXT:    br i1 [[TMP23]], label [[AESENCWIDE128KL_NO_ERROR_I:%.*]], label [[AESENCWIDE128KL_ERROR_I:%.*]]
// CHECK64:       aesencwide128kl_no_error.i:
// CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
// CHECK64-NEXT:    store <2 x i64> [[TMP24]], ptr [[TMP3]], align 16
// CHECK64-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
// CHECK64-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 1
// CHECK64-NEXT:    store <2 x i64> [[TMP25]], ptr [[TMP26]], align 16
// CHECK64-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
// CHECK64-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 2
// CHECK64-NEXT:    store <2 x i64> [[TMP27]], ptr [[TMP28]], align 16
// CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
// CHECK64-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 3
// CHECK64-NEXT:    store <2 x i64> [[TMP29]], ptr [[TMP30]], align 16
// CHECK64-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
// CHECK64-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 4
// CHECK64-NEXT:    store <2 x i64> [[TMP31]], ptr [[TMP32]], align 16
// CHECK64-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
// CHECK64-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 5
// CHECK64-NEXT:    store <2 x i64> [[TMP33]], ptr [[TMP34]], align 16
// CHECK64-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
// CHECK64-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 6
// CHECK64-NEXT:    store <2 x i64> [[TMP35]], ptr [[TMP36]], align 16
// CHECK64-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
// CHECK64-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 7
// CHECK64-NEXT:    store <2 x i64> [[TMP37]], ptr [[TMP38]], align 16
// CHECK64-NEXT:    br label [[_MM_AESENCWIDE128KL_U8_EXIT:%.*]]
// CHECK64:       aesencwide128kl_error.i:
// CHECK64-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
// CHECK64-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
// CHECK64-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 1
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP41]], align 16
// CHECK64-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
// CHECK64-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 2
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP43]], align 16
// CHECK64-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
// CHECK64-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 3
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP45]], align 16
// CHECK64-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
// CHECK64-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 4
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP47]], align 16
// CHECK64-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
// CHECK64-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 5
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP49]], align 16
// CHECK64-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
// CHECK64-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 6
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP51]], align 16
// CHECK64-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
// CHECK64-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 7
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP53]], align 16
// CHECK64-NEXT:    br label [[_MM_AESENCWIDE128KL_U8_EXIT]]
// CHECK64:       _mm_aesencwide128kl_u8.exit:
// CHECK64-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK64-NEXT:    ret i8 [[TMP54]]
//
// CHECK32-LABEL: @test__mm_aesencwide128kl_u8(
// CHECK32-NEXT:  entry:
// CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    store ptr [[ODATA:%.*]], ptr [[ODATA_ADDR]], align 4
// CHECK32-NEXT:    store ptr [[IDATA:%.*]], ptr [[IDATA_ADDR]], align 4
// CHECK32-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ODATA_ADDR]], align 4
// CHECK32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IDATA_ADDR]], align 4
// CHECK32-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    store ptr [[TMP0]], ptr [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT:    store ptr [[TMP1]], ptr [[__IDATA_ADDR_I]], align 4
// CHECK32-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[__IDATA_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 16
// CHECK32-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 1
// CHECK32-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[TMP7]], align 16
// CHECK32-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 2
// CHECK32-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr [[TMP9]], align 16
// CHECK32-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 3
// CHECK32-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr [[TMP11]], align 16
// CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 4
// CHECK32-NEXT:    [[TMP14:%.*]] = load <2 x i64>, ptr [[TMP13]], align 16
// CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 5
// CHECK32-NEXT:    [[TMP16:%.*]] = load <2 x i64>, ptr [[TMP15]], align 16
// CHECK32-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 6
// CHECK32-NEXT:    [[TMP18:%.*]] = load <2 x i64>, ptr [[TMP17]], align 16
// CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 7
// CHECK32-NEXT:    [[TMP20:%.*]] = load <2 x i64>, ptr [[TMP19]], align 16
// CHECK32-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(ptr [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK32-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK32-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK32-NEXT:    br i1 [[TMP23]], label [[AESENCWIDE128KL_NO_ERROR_I:%.*]], label [[AESENCWIDE128KL_ERROR_I:%.*]]
// CHECK32:       aesencwide128kl_no_error.i:
// CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
// CHECK32-NEXT:    store <2 x i64> [[TMP24]], ptr [[TMP3]], align 16
// CHECK32-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
// CHECK32-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 1
// CHECK32-NEXT:    store <2 x i64> [[TMP25]], ptr [[TMP26]], align 16
// CHECK32-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
// CHECK32-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 2
// CHECK32-NEXT:    store <2 x i64> [[TMP27]], ptr [[TMP28]], align 16
// CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
// CHECK32-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 3
// CHECK32-NEXT:    store <2 x i64> [[TMP29]], ptr [[TMP30]], align 16
// CHECK32-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
// CHECK32-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 4
// CHECK32-NEXT:    store <2 x i64> [[TMP31]], ptr [[TMP32]], align 16
// CHECK32-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
// CHECK32-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 5
// CHECK32-NEXT:    store <2 x i64> [[TMP33]], ptr [[TMP34]], align 16
// CHECK32-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
// CHECK32-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 6
// CHECK32-NEXT:    store <2 x i64> [[TMP35]], ptr [[TMP36]], align 16
// CHECK32-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
// CHECK32-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 7
// CHECK32-NEXT:    store <2 x i64> [[TMP37]], ptr [[TMP38]], align 16
// CHECK32-NEXT:    br label [[_MM_AESENCWIDE128KL_U8_EXIT:%.*]]
// CHECK32:       aesencwide128kl_error.i:
// CHECK32-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
// CHECK32-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
// CHECK32-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 1
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP41]], align 16
// CHECK32-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
// CHECK32-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 2
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP43]], align 16
// CHECK32-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
// CHECK32-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 3
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP45]], align 16
// CHECK32-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
// CHECK32-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 4
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP47]], align 16
// CHECK32-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
// CHECK32-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 5
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP49]], align 16
// CHECK32-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
// CHECK32-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 6
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP51]], align 16
// CHECK32-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
// CHECK32-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 7
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP53]], align 16
// CHECK32-NEXT:    br label [[_MM_AESENCWIDE128KL_U8_EXIT]]
// CHECK32:       _mm_aesencwide128kl_u8.exit:
// CHECK32-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK32-NEXT:    ret i8 [[TMP54]]
//
unsigned char test__mm_aesencwide128kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
  // Codegen test for _mm_aesencwide128kl_u8: forwards all three arguments
  // unchanged and returns the intrinsic's unsigned char result.
  // The FileCheck assertions above expect eight <2 x i64> loads from idata
  // (element offsets 0 through 7), a single call to
  // @llvm.x86.aesencwide128kl taking h followed by those eight vectors, and a
  // branch on the low bit of the returned i8. The no-error block stores
  // result elements 1 through 8 to odata offsets 0 through 7; the error block
  // stores zeroinitializer to the same eight slots. Element 0 of the call
  // result is returned.
  // Keep the body a single call so the -O0 IR continues to match.
  return _mm_aesencwide128kl_u8(odata, idata, h);
}

// CHECK64-LABEL: @test__mm_aesdecwide128kl_u8(
// CHECK64-NEXT:  entry:
// CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    store ptr [[ODATA:%.*]], ptr [[ODATA_ADDR]], align 8
// CHECK64-NEXT:    store ptr [[IDATA:%.*]], ptr [[IDATA_ADDR]], align 8
// CHECK64-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ODATA_ADDR]], align 8
// CHECK64-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IDATA_ADDR]], align 8
// CHECK64-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    store ptr [[TMP0]], ptr [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT:    store ptr [[TMP1]], ptr [[__IDATA_ADDR_I]], align 8
// CHECK64-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[__IDATA_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 16
// CHECK64-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 1
// CHECK64-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[TMP7]], align 16
// CHECK64-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 2
// CHECK64-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr [[TMP9]], align 16
// CHECK64-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 3
// CHECK64-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr [[TMP11]], align 16
// CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 4
// CHECK64-NEXT:    [[TMP14:%.*]] = load <2 x i64>, ptr [[TMP13]], align 16
// CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 5
// CHECK64-NEXT:    [[TMP16:%.*]] = load <2 x i64>, ptr [[TMP15]], align 16
// CHECK64-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 6
// CHECK64-NEXT:    [[TMP18:%.*]] = load <2 x i64>, ptr [[TMP17]], align 16
// CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 7
// CHECK64-NEXT:    [[TMP20:%.*]] = load <2 x i64>, ptr [[TMP19]], align 16
// CHECK64-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(ptr [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK64-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK64-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK64-NEXT:    br i1 [[TMP23]], label [[AESDECWIDE128KL_NO_ERROR_I:%.*]], label [[AESDECWIDE128KL_ERROR_I:%.*]]
// CHECK64:       aesdecwide128kl_no_error.i:
// CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
// CHECK64-NEXT:    store <2 x i64> [[TMP24]], ptr [[TMP3]], align 16
// CHECK64-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
// CHECK64-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 1
// CHECK64-NEXT:    store <2 x i64> [[TMP25]], ptr [[TMP26]], align 16
// CHECK64-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
// CHECK64-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 2
// CHECK64-NEXT:    store <2 x i64> [[TMP27]], ptr [[TMP28]], align 16
// CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
// CHECK64-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 3
// CHECK64-NEXT:    store <2 x i64> [[TMP29]], ptr [[TMP30]], align 16
// CHECK64-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
// CHECK64-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 4
// CHECK64-NEXT:    store <2 x i64> [[TMP31]], ptr [[TMP32]], align 16
// CHECK64-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
// CHECK64-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 5
// CHECK64-NEXT:    store <2 x i64> [[TMP33]], ptr [[TMP34]], align 16
// CHECK64-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
// CHECK64-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 6
// CHECK64-NEXT:    store <2 x i64> [[TMP35]], ptr [[TMP36]], align 16
// CHECK64-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
// CHECK64-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 7
// CHECK64-NEXT:    store <2 x i64> [[TMP37]], ptr [[TMP38]], align 16
// CHECK64-NEXT:    br label [[_MM_AESDECWIDE128KL_U8_EXIT:%.*]]
// CHECK64:       aesdecwide128kl_error.i:
// CHECK64-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
// CHECK64-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
// CHECK64-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 1
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP41]], align 16
// CHECK64-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
// CHECK64-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 2
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP43]], align 16
// CHECK64-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
// CHECK64-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 3
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP45]], align 16
// CHECK64-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
// CHECK64-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 4
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP47]], align 16
// CHECK64-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
// CHECK64-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 5
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP49]], align 16
// CHECK64-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
// CHECK64-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 6
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP51]], align 16
// CHECK64-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
// CHECK64-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 7
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP53]], align 16
// CHECK64-NEXT:    br label [[_MM_AESDECWIDE128KL_U8_EXIT]]
// CHECK64:       _mm_aesdecwide128kl_u8.exit:
// CHECK64-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK64-NEXT:    ret i8 [[TMP54]]
//
// CHECK32-LABEL: @test__mm_aesdecwide128kl_u8(
// CHECK32-NEXT:  entry:
// CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    store ptr [[ODATA:%.*]], ptr [[ODATA_ADDR]], align 4
// CHECK32-NEXT:    store ptr [[IDATA:%.*]], ptr [[IDATA_ADDR]], align 4
// CHECK32-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ODATA_ADDR]], align 4
// CHECK32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IDATA_ADDR]], align 4
// CHECK32-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    store ptr [[TMP0]], ptr [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT:    store ptr [[TMP1]], ptr [[__IDATA_ADDR_I]], align 4
// CHECK32-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[__IDATA_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 16
// CHECK32-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 1
// CHECK32-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[TMP7]], align 16
// CHECK32-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 2
// CHECK32-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr [[TMP9]], align 16
// CHECK32-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 3
// CHECK32-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr [[TMP11]], align 16
// CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 4
// CHECK32-NEXT:    [[TMP14:%.*]] = load <2 x i64>, ptr [[TMP13]], align 16
// CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 5
// CHECK32-NEXT:    [[TMP16:%.*]] = load <2 x i64>, ptr [[TMP15]], align 16
// CHECK32-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 6
// CHECK32-NEXT:    [[TMP18:%.*]] = load <2 x i64>, ptr [[TMP17]], align 16
// CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 7
// CHECK32-NEXT:    [[TMP20:%.*]] = load <2 x i64>, ptr [[TMP19]], align 16
// CHECK32-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(ptr [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK32-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK32-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK32-NEXT:    br i1 [[TMP23]], label [[AESDECWIDE128KL_NO_ERROR_I:%.*]], label [[AESDECWIDE128KL_ERROR_I:%.*]]
// CHECK32:       aesdecwide128kl_no_error.i:
// CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
// CHECK32-NEXT:    store <2 x i64> [[TMP24]], ptr [[TMP3]], align 16
// CHECK32-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
// CHECK32-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 1
// CHECK32-NEXT:    store <2 x i64> [[TMP25]], ptr [[TMP26]], align 16
// CHECK32-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
// CHECK32-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 2
// CHECK32-NEXT:    store <2 x i64> [[TMP27]], ptr [[TMP28]], align 16
// CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
// CHECK32-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 3
// CHECK32-NEXT:    store <2 x i64> [[TMP29]], ptr [[TMP30]], align 16
// CHECK32-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
// CHECK32-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 4
// CHECK32-NEXT:    store <2 x i64> [[TMP31]], ptr [[TMP32]], align 16
// CHECK32-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
// CHECK32-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 5
// CHECK32-NEXT:    store <2 x i64> [[TMP33]], ptr [[TMP34]], align 16
// CHECK32-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
// CHECK32-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 6
// CHECK32-NEXT:    store <2 x i64> [[TMP35]], ptr [[TMP36]], align 16
// CHECK32-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
// CHECK32-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 7
// CHECK32-NEXT:    store <2 x i64> [[TMP37]], ptr [[TMP38]], align 16
// CHECK32-NEXT:    br label [[_MM_AESDECWIDE128KL_U8_EXIT:%.*]]
// CHECK32:       aesdecwide128kl_error.i:
// CHECK32-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
// CHECK32-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
// CHECK32-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 1
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP41]], align 16
// CHECK32-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
// CHECK32-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 2
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP43]], align 16
// CHECK32-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
// CHECK32-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 3
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP45]], align 16
// CHECK32-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
// CHECK32-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 4
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP47]], align 16
// CHECK32-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
// CHECK32-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 5
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP49]], align 16
// CHECK32-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
// CHECK32-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 6
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP51]], align 16
// CHECK32-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
// CHECK32-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 7
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP53]], align 16
// CHECK32-NEXT:    br label [[_MM_AESDECWIDE128KL_U8_EXIT]]
// CHECK32:       _mm_aesdecwide128kl_u8.exit:
// CHECK32-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK32-NEXT:    ret i8 [[TMP54]]
//
unsigned char test__mm_aesdecwide128kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
  // Codegen test for _mm_aesdecwide128kl_u8: forwards odata, idata and h
  // unchanged and returns the intrinsic's result as-is. The body is a single
  // call on purpose; the autogenerated FileCheck lines above match the -O0 IR
  // instruction by instruction, so any extra local or statement would break
  // them and require regenerating the assertions.
  //
  // What those assertions expect from the inlined intrinsic:
  //   - eight <2 x i64> loads from idata (elements 0 through 7),
  //   - one call to @llvm.x86.aesdecwide128kl taking h plus the eight blocks,
  //   - a branch on the low bit of the returned i8: when set, the eight result
  //     vectors are stored to odata[0..7]; when clear, odata[0..7] are each
  //     overwritten with zeroinitializer,
  //   - the same i8 (struct element 0) returned to the caller.
  return _mm_aesdecwide128kl_u8(odata, idata, h);
}

// CHECK64-LABEL: @test__mm_aesencwide256kl_u8(
// CHECK64-NEXT:  entry:
// CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    store ptr [[ODATA:%.*]], ptr [[ODATA_ADDR]], align 8
// CHECK64-NEXT:    store ptr [[IDATA:%.*]], ptr [[IDATA_ADDR]], align 8
// CHECK64-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ODATA_ADDR]], align 8
// CHECK64-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IDATA_ADDR]], align 8
// CHECK64-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    store ptr [[TMP0]], ptr [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT:    store ptr [[TMP1]], ptr [[__IDATA_ADDR_I]], align 8
// CHECK64-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[__IDATA_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 16
// CHECK64-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 1
// CHECK64-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[TMP7]], align 16
// CHECK64-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 2
// CHECK64-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr [[TMP9]], align 16
// CHECK64-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 3
// CHECK64-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr [[TMP11]], align 16
// CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 4
// CHECK64-NEXT:    [[TMP14:%.*]] = load <2 x i64>, ptr [[TMP13]], align 16
// CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 5
// CHECK64-NEXT:    [[TMP16:%.*]] = load <2 x i64>, ptr [[TMP15]], align 16
// CHECK64-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 6
// CHECK64-NEXT:    [[TMP18:%.*]] = load <2 x i64>, ptr [[TMP17]], align 16
// CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 7
// CHECK64-NEXT:    [[TMP20:%.*]] = load <2 x i64>, ptr [[TMP19]], align 16
// CHECK64-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(ptr [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK64-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK64-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK64-NEXT:    br i1 [[TMP23]], label [[AESENCWIDE256KL_NO_ERROR_I:%.*]], label [[AESENCWIDE256KL_ERROR_I:%.*]]
// CHECK64:       aesencwide256kl_no_error.i:
// CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
// CHECK64-NEXT:    store <2 x i64> [[TMP24]], ptr [[TMP3]], align 16
// CHECK64-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
// CHECK64-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 1
// CHECK64-NEXT:    store <2 x i64> [[TMP25]], ptr [[TMP26]], align 16
// CHECK64-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
// CHECK64-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 2
// CHECK64-NEXT:    store <2 x i64> [[TMP27]], ptr [[TMP28]], align 16
// CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
// CHECK64-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 3
// CHECK64-NEXT:    store <2 x i64> [[TMP29]], ptr [[TMP30]], align 16
// CHECK64-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
// CHECK64-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 4
// CHECK64-NEXT:    store <2 x i64> [[TMP31]], ptr [[TMP32]], align 16
// CHECK64-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
// CHECK64-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 5
// CHECK64-NEXT:    store <2 x i64> [[TMP33]], ptr [[TMP34]], align 16
// CHECK64-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
// CHECK64-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 6
// CHECK64-NEXT:    store <2 x i64> [[TMP35]], ptr [[TMP36]], align 16
// CHECK64-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
// CHECK64-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 7
// CHECK64-NEXT:    store <2 x i64> [[TMP37]], ptr [[TMP38]], align 16
// CHECK64-NEXT:    br label [[_MM_AESENCWIDE256KL_U8_EXIT:%.*]]
// CHECK64:       aesencwide256kl_error.i:
// CHECK64-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
// CHECK64-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
// CHECK64-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 1
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP41]], align 16
// CHECK64-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
// CHECK64-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 2
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP43]], align 16
// CHECK64-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
// CHECK64-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 3
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP45]], align 16
// CHECK64-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
// CHECK64-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 4
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP47]], align 16
// CHECK64-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
// CHECK64-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 5
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP49]], align 16
// CHECK64-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
// CHECK64-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 6
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP51]], align 16
// CHECK64-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
// CHECK64-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 7
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP53]], align 16
// CHECK64-NEXT:    br label [[_MM_AESENCWIDE256KL_U8_EXIT]]
// CHECK64:       _mm_aesencwide256kl_u8.exit:
// CHECK64-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK64-NEXT:    ret i8 [[TMP54]]
//
// CHECK32-LABEL: @test__mm_aesencwide256kl_u8(
// CHECK32-NEXT:  entry:
// CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    store ptr [[ODATA:%.*]], ptr [[ODATA_ADDR]], align 4
// CHECK32-NEXT:    store ptr [[IDATA:%.*]], ptr [[IDATA_ADDR]], align 4
// CHECK32-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ODATA_ADDR]], align 4
// CHECK32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IDATA_ADDR]], align 4
// CHECK32-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    store ptr [[TMP0]], ptr [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT:    store ptr [[TMP1]], ptr [[__IDATA_ADDR_I]], align 4
// CHECK32-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[__IDATA_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 16
// CHECK32-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 1
// CHECK32-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[TMP7]], align 16
// CHECK32-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 2
// CHECK32-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr [[TMP9]], align 16
// CHECK32-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 3
// CHECK32-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr [[TMP11]], align 16
// CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 4
// CHECK32-NEXT:    [[TMP14:%.*]] = load <2 x i64>, ptr [[TMP13]], align 16
// CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 5
// CHECK32-NEXT:    [[TMP16:%.*]] = load <2 x i64>, ptr [[TMP15]], align 16
// CHECK32-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 6
// CHECK32-NEXT:    [[TMP18:%.*]] = load <2 x i64>, ptr [[TMP17]], align 16
// CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 7
// CHECK32-NEXT:    [[TMP20:%.*]] = load <2 x i64>, ptr [[TMP19]], align 16
// CHECK32-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(ptr [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK32-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK32-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK32-NEXT:    br i1 [[TMP23]], label [[AESENCWIDE256KL_NO_ERROR_I:%.*]], label [[AESENCWIDE256KL_ERROR_I:%.*]]
// CHECK32:       aesencwide256kl_no_error.i:
// CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
// CHECK32-NEXT:    store <2 x i64> [[TMP24]], ptr [[TMP3]], align 16
// CHECK32-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
// CHECK32-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 1
// CHECK32-NEXT:    store <2 x i64> [[TMP25]], ptr [[TMP26]], align 16
// CHECK32-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
// CHECK32-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 2
// CHECK32-NEXT:    store <2 x i64> [[TMP27]], ptr [[TMP28]], align 16
// CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
// CHECK32-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 3
// CHECK32-NEXT:    store <2 x i64> [[TMP29]], ptr [[TMP30]], align 16
// CHECK32-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
// CHECK32-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 4
// CHECK32-NEXT:    store <2 x i64> [[TMP31]], ptr [[TMP32]], align 16
// CHECK32-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
// CHECK32-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 5
// CHECK32-NEXT:    store <2 x i64> [[TMP33]], ptr [[TMP34]], align 16
// CHECK32-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
// CHECK32-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 6
// CHECK32-NEXT:    store <2 x i64> [[TMP35]], ptr [[TMP36]], align 16
// CHECK32-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
// CHECK32-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 7
// CHECK32-NEXT:    store <2 x i64> [[TMP37]], ptr [[TMP38]], align 16
// CHECK32-NEXT:    br label [[_MM_AESENCWIDE256KL_U8_EXIT:%.*]]
// CHECK32:       aesencwide256kl_error.i:
// CHECK32-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
// CHECK32-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
// CHECK32-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 1
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP41]], align 16
// CHECK32-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
// CHECK32-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 2
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP43]], align 16
// CHECK32-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
// CHECK32-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 3
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP45]], align 16
// CHECK32-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
// CHECK32-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 4
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP47]], align 16
// CHECK32-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
// CHECK32-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 5
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP49]], align 16
// CHECK32-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
// CHECK32-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 6
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP51]], align 16
// CHECK32-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
// CHECK32-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 7
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP53]], align 16
// CHECK32-NEXT:    br label [[_MM_AESENCWIDE256KL_U8_EXIT]]
// CHECK32:       _mm_aesencwide256kl_u8.exit:
// CHECK32-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK32-NEXT:    ret i8 [[TMP54]]
//
unsigned char test__mm_aesencwide256kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
  // Codegen test for _mm_aesencwide256kl_u8: forwards odata, idata and h
  // unchanged and returns the intrinsic's result as-is. The body is a single
  // call on purpose; the autogenerated FileCheck lines above (one set per
  // target triple) match the -O0 IR instruction by instruction, so any extra
  // local or statement would break them and require regenerating the
  // assertions.
  //
  // What those assertions expect from the inlined intrinsic, on both targets
  // (they differ only in pointer alignment, 8 vs 4):
  //   - eight <2 x i64> loads from idata (elements 0 through 7),
  //   - one call to @llvm.x86.aesencwide256kl taking h plus the eight blocks,
  //   - a branch on the low bit of the returned i8: when set, the eight result
  //     vectors are stored to odata[0..7]; when clear, odata[0..7] are each
  //     overwritten with zeroinitializer,
  //   - the same i8 (struct element 0) returned to the caller.
  return _mm_aesencwide256kl_u8(odata, idata, h);
}

// CHECK64-LABEL: @test__mm_aesdecwide256kl_u8(
// CHECK64-NEXT:  entry:
// CHECK64-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[ODATA_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[IDATA_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 8
// CHECK64-NEXT:    store ptr [[ODATA:%.*]], ptr [[ODATA_ADDR]], align 8
// CHECK64-NEXT:    store ptr [[IDATA:%.*]], ptr [[IDATA_ADDR]], align 8
// CHECK64-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ODATA_ADDR]], align 8
// CHECK64-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IDATA_ADDR]], align 8
// CHECK64-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 8
// CHECK64-NEXT:    store ptr [[TMP0]], ptr [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT:    store ptr [[TMP1]], ptr [[__IDATA_ADDR_I]], align 8
// CHECK64-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[__IDATA_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 8
// CHECK64-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 16
// CHECK64-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 1
// CHECK64-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[TMP7]], align 16
// CHECK64-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 2
// CHECK64-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr [[TMP9]], align 16
// CHECK64-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 3
// CHECK64-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr [[TMP11]], align 16
// CHECK64-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 4
// CHECK64-NEXT:    [[TMP14:%.*]] = load <2 x i64>, ptr [[TMP13]], align 16
// CHECK64-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 5
// CHECK64-NEXT:    [[TMP16:%.*]] = load <2 x i64>, ptr [[TMP15]], align 16
// CHECK64-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 6
// CHECK64-NEXT:    [[TMP18:%.*]] = load <2 x i64>, ptr [[TMP17]], align 16
// CHECK64-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 7
// CHECK64-NEXT:    [[TMP20:%.*]] = load <2 x i64>, ptr [[TMP19]], align 16
// CHECK64-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(ptr [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK64-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK64-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK64-NEXT:    br i1 [[TMP23]], label [[AESDECWIDE256KL_NO_ERROR_I:%.*]], label [[AESDECWIDE256KL_ERROR_I:%.*]]
// CHECK64:       aesdecwide256kl_no_error.i:
// CHECK64-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
// CHECK64-NEXT:    store <2 x i64> [[TMP24]], ptr [[TMP3]], align 16
// CHECK64-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
// CHECK64-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 1
// CHECK64-NEXT:    store <2 x i64> [[TMP25]], ptr [[TMP26]], align 16
// CHECK64-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
// CHECK64-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 2
// CHECK64-NEXT:    store <2 x i64> [[TMP27]], ptr [[TMP28]], align 16
// CHECK64-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
// CHECK64-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 3
// CHECK64-NEXT:    store <2 x i64> [[TMP29]], ptr [[TMP30]], align 16
// CHECK64-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
// CHECK64-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 4
// CHECK64-NEXT:    store <2 x i64> [[TMP31]], ptr [[TMP32]], align 16
// CHECK64-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
// CHECK64-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 5
// CHECK64-NEXT:    store <2 x i64> [[TMP33]], ptr [[TMP34]], align 16
// CHECK64-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
// CHECK64-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 6
// CHECK64-NEXT:    store <2 x i64> [[TMP35]], ptr [[TMP36]], align 16
// CHECK64-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
// CHECK64-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 7
// CHECK64-NEXT:    store <2 x i64> [[TMP37]], ptr [[TMP38]], align 16
// CHECK64-NEXT:    br label [[_MM_AESDECWIDE256KL_U8_EXIT:%.*]]
// CHECK64:       aesdecwide256kl_error.i:
// CHECK64-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
// CHECK64-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
// CHECK64-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 1
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP41]], align 16
// CHECK64-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
// CHECK64-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 2
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP43]], align 16
// CHECK64-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
// CHECK64-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 3
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP45]], align 16
// CHECK64-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
// CHECK64-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 4
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP47]], align 16
// CHECK64-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
// CHECK64-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 5
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP49]], align 16
// CHECK64-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
// CHECK64-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 6
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP51]], align 16
// CHECK64-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
// CHECK64-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 7
// CHECK64-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP53]], align 16
// CHECK64-NEXT:    br label [[_MM_AESDECWIDE256KL_U8_EXIT]]
// CHECK64:       _mm_aesdecwide256kl_u8.exit:
// CHECK64-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK64-NEXT:    ret i8 [[TMP54]]
//
// CHECK32-LABEL: @test__mm_aesdecwide256kl_u8(
// CHECK32-NEXT:  entry:
// CHECK32-NEXT:    [[__ODATA_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[__IDATA_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[__H_ADDR_I:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[ODATA_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[IDATA_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    [[H_ADDR:%.*]] = alloca ptr, align 4
// CHECK32-NEXT:    store ptr [[ODATA:%.*]], ptr [[ODATA_ADDR]], align 4
// CHECK32-NEXT:    store ptr [[IDATA:%.*]], ptr [[IDATA_ADDR]], align 4
// CHECK32-NEXT:    store ptr [[H:%.*]], ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ODATA_ADDR]], align 4
// CHECK32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IDATA_ADDR]], align 4
// CHECK32-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[H_ADDR]], align 4
// CHECK32-NEXT:    store ptr [[TMP0]], ptr [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT:    store ptr [[TMP1]], ptr [[__IDATA_ADDR_I]], align 4
// CHECK32-NEXT:    store ptr [[TMP2]], ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[__IDATA_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__H_ADDR_I]], align 4
// CHECK32-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 16
// CHECK32-NEXT:    [[TMP7:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 1
// CHECK32-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[TMP7]], align 16
// CHECK32-NEXT:    [[TMP9:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 2
// CHECK32-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr [[TMP9]], align 16
// CHECK32-NEXT:    [[TMP11:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 3
// CHECK32-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr [[TMP11]], align 16
// CHECK32-NEXT:    [[TMP13:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 4
// CHECK32-NEXT:    [[TMP14:%.*]] = load <2 x i64>, ptr [[TMP13]], align 16
// CHECK32-NEXT:    [[TMP15:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 5
// CHECK32-NEXT:    [[TMP16:%.*]] = load <2 x i64>, ptr [[TMP15]], align 16
// CHECK32-NEXT:    [[TMP17:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 6
// CHECK32-NEXT:    [[TMP18:%.*]] = load <2 x i64>, ptr [[TMP17]], align 16
// CHECK32-NEXT:    [[TMP19:%.*]] = getelementptr <2 x i64>, ptr [[TMP4]], i32 7
// CHECK32-NEXT:    [[TMP20:%.*]] = load <2 x i64>, ptr [[TMP19]], align 16
// CHECK32-NEXT:    [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(ptr [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK32-NEXT:    [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK32-NEXT:    [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK32-NEXT:    br i1 [[TMP23]], label [[AESDECWIDE256KL_NO_ERROR_I:%.*]], label [[AESDECWIDE256KL_ERROR_I:%.*]]
// CHECK32:       aesdecwide256kl_no_error.i:
// CHECK32-NEXT:    [[TMP24:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
// CHECK32-NEXT:    store <2 x i64> [[TMP24]], ptr [[TMP3]], align 16
// CHECK32-NEXT:    [[TMP25:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
// CHECK32-NEXT:    [[TMP26:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 1
// CHECK32-NEXT:    store <2 x i64> [[TMP25]], ptr [[TMP26]], align 16
// CHECK32-NEXT:    [[TMP27:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
// CHECK32-NEXT:    [[TMP28:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 2
// CHECK32-NEXT:    store <2 x i64> [[TMP27]], ptr [[TMP28]], align 16
// CHECK32-NEXT:    [[TMP29:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
// CHECK32-NEXT:    [[TMP30:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 3
// CHECK32-NEXT:    store <2 x i64> [[TMP29]], ptr [[TMP30]], align 16
// CHECK32-NEXT:    [[TMP31:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
// CHECK32-NEXT:    [[TMP32:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 4
// CHECK32-NEXT:    store <2 x i64> [[TMP31]], ptr [[TMP32]], align 16
// CHECK32-NEXT:    [[TMP33:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
// CHECK32-NEXT:    [[TMP34:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 5
// CHECK32-NEXT:    store <2 x i64> [[TMP33]], ptr [[TMP34]], align 16
// CHECK32-NEXT:    [[TMP35:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
// CHECK32-NEXT:    [[TMP36:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 6
// CHECK32-NEXT:    store <2 x i64> [[TMP35]], ptr [[TMP36]], align 16
// CHECK32-NEXT:    [[TMP37:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
// CHECK32-NEXT:    [[TMP38:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 7
// CHECK32-NEXT:    store <2 x i64> [[TMP37]], ptr [[TMP38]], align 16
// CHECK32-NEXT:    br label [[_MM_AESDECWIDE256KL_U8_EXIT:%.*]]
// CHECK32:       aesdecwide256kl_error.i:
// CHECK32-NEXT:    [[TMP39:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 1
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
// CHECK32-NEXT:    [[TMP40:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 2
// CHECK32-NEXT:    [[TMP41:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 1
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP41]], align 16
// CHECK32-NEXT:    [[TMP42:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 3
// CHECK32-NEXT:    [[TMP43:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 2
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP43]], align 16
// CHECK32-NEXT:    [[TMP44:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 4
// CHECK32-NEXT:    [[TMP45:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 3
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP45]], align 16
// CHECK32-NEXT:    [[TMP46:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 5
// CHECK32-NEXT:    [[TMP47:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 4
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP47]], align 16
// CHECK32-NEXT:    [[TMP48:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 6
// CHECK32-NEXT:    [[TMP49:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 5
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP49]], align 16
// CHECK32-NEXT:    [[TMP50:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 7
// CHECK32-NEXT:    [[TMP51:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 6
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP51]], align 16
// CHECK32-NEXT:    [[TMP52:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 8
// CHECK32-NEXT:    [[TMP53:%.*]] = getelementptr <2 x i64>, ptr [[TMP3]], i32 7
// CHECK32-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP53]], align 16
// CHECK32-NEXT:    br label [[_MM_AESDECWIDE256KL_U8_EXIT]]
// CHECK32:       _mm_aesdecwide256kl_u8.exit:
// CHECK32-NEXT:    [[TMP54:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK32-NEXT:    ret i8 [[TMP54]]
//
// Codegen test for the _mm_aesdecwide256kl_u8 wrapper from <x86intrin.h>.
// The three arguments are forwarded unchanged and the wrapper's unsigned char
// result is returned directly.
//
// Per the autogenerated assertions above, the expected -O0 IR:
//   - loads eight <2 x i64> blocks from idata[0..7],
//   - passes them together with h to @llvm.x86.aesdecwide256kl,
//   - truncates element 0 of the returned aggregate (an i8) to i1 and
//     branches on it: when set, result elements 1..8 are stored to
//     odata[0..7]; otherwise all eight odata slots are zero-filled,
//   - returns element 0 (the i8) as the function result.
//
// NOTE: the assertions were produced by utils/update_cc_test_checks.py (see
// the file header); regenerate them if the IR emitted for this body changes.
unsigned char test__mm_aesdecwide256kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
  return _mm_aesdecwide256kl_u8(odata, idata, h);
}
